In an ever-changing world, one can easily be influenced by the modern tools being developed around the globe to make our lives better. Data science is one such tool, and it is in the spotlight.
Today every device we use collects some data to make our lives easier, but so much data is being gathered that it becomes difficult to make sense of it. One of the responsibilities of a data scientist is to present this data in a way that everyone can understand.
This notebook depicts how data science has spread
across the globe,
across different genders,
across different age groups
and lastly every type of industry.
It also shows how much money various industries are investing in data science.
The analysis of this survey largely concludes that data science is open not only to veterans — there is also room for newcomers to explore their skills in data science.
# pip install pygal
import pandas as pd
import numpy as np
# BUG FIX: the original `import matplotlib as plt` bound the top-level
# matplotlib package to `plt`, which does not expose the pyplot API
# (plt.pie, plt.title, ...). Import the pyplot submodule, matching how
# `plt` is used throughout the rest of the notebook.
import matplotlib.pyplot as plt
import pygal
import seaborn as sns
Importing the data
# Load the raw Kaggle 2022 survey responses and show (rows, columns).
# Row 0 of this file holds the full question text, not data.
df = pd.read_csv('kaggle_survey_2022_responses.csv')
df.shape
C:\Users\rishi\AppData\Local\Temp\ipykernel_21880\3469635955.py:1: DtypeWarning: Columns (0,15,43,57,73,88,104,118,126,132,170,200,208,215,225,248,255,257,260,270,271,272,277,281,294) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv('kaggle_survey_2022_responses.csv')
(23998, 296)
# Build a per-question null summary: the question text (row 0 of the raw
# frame) alongside the count and percentage of missing answers.
col = df.iloc[0, :]
null = df.isna().sum()
tuple_null = list(zip(col, null))
# The question texts are very long, so (question, null_count) pairs are
# easier to inspect than the raw frame.
df_null = pd.DataFrame(tuple_null, columns=['column', 'null_sum'])
df_null['null_percentage'] = df_null['null_sum'] / len(df) * 100
df_null
| column | null_sum | null_percentage | |
|---|---|---|---|
| 0 | Duration (in seconds) | 0 | 0.000000 |
| 1 | What is your age (# years)? | 0 | 0.000000 |
| 2 | What is your gender? - Selected Choice | 0 | 0.000000 |
| 3 | In which country do you currently reside? | 0 | 0.000000 |
| 4 | Are you currently a student? (high school, uni... | 0 | 0.000000 |
| ... | ... | ... | ... |
| 291 | Who/what are your favorite media sources that ... | 16231 | 67.634803 |
| 292 | Who/what are your favorite media sources that ... | 20193 | 84.144512 |
| 293 | Who/what are your favorite media sources that ... | 22271 | 92.803567 |
| 294 | Who/what are your favorite media sources that ... | 23997 | 99.995833 |
| 295 | Who/what are your favorite media sources that ... | 23162 | 96.516376 |
296 rows × 3 columns
# Collect the question columns that are more than 80% empty; they will be
# dropped from the working frame below.
df_drop = df_null.loc[df_null['null_percentage'] > 80]
df_d = df_drop['column'].tolist()
Changing the header and dropping the null values
# Re-read the survey using row 1 (the question text) as the header, then
# drop the sparse columns identified above.
df_2 = pd.read_csv('kaggle_survey_2022_responses.csv',header=1)
df_new = df_2.drop(columns=df_d, axis = 1)
C:\Users\rishi\AppData\Local\Temp\ipykernel_21880\832517846.py:1: DtypeWarning: Columns (208,225,255,257,260,270,271,277) have mixed types. Specify dtype option on import or set low_memory=False.
df_2 = pd.read_csv('kaggle_survey_2022_responses.csv',header=1)
print(f'\n Old null = {df.isna().sum().sum()/(df.shape[0]*df.shape[1])*100:.2f}% \n New Null = {df_new.isna().sum().sum()/(df.shape[0]*df.shape[1])*100:.2f}%')
Old null = 88.38% New Null = 8.95%
# Alias the cleaned frame and replace the two batches of verbose survey
# headers with short working names in a single rename call.
col = df.iloc[0, :]
df_n = df_new  # alias, not a copy: in-place renames below also affect df_new
rename_dict = {'Duration (in seconds)': 'duration',
               'What is your age (# years)?': 'age',
               'What is your gender? - Selected Choice': 'sex',
               'In which country do you currently reside?': 'country',
               'Are you currently a student? (high school, university, or graduate)': 'education'}
rename_dict2 = {'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Coursera': 'coursera',
                'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Kaggle Learn Courses': 'kaggle',
                'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Udemy': 'udemy',
                'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - University Courses (resulting in a university degree)': 'university',
                'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Other': 'other'}
# The two dicts have no overlapping keys, so one merged rename is equivalent
# to the original pair of rename calls.
df_n.rename(columns={**rename_dict, **rename_dict2}, inplace=True)
col_n = df_n.columns[0:15]
Using a for loop to modify dataframe values
# Turn columns 7:10 into 0/1 flags: any non-null answer -> 1, missing -> 0.
for name in df_n.columns[7:10]:
    df_n[name] = df_n[name].fillna(0).apply(lambda v: 0 if v == 0 else 1)
col_n = df_n.columns[10:14]
# Collapse the four "useful platform" answer columns under one shared header.
# NOTE(review): this intentionally gives four columns the SAME name
# 'useful_platform'; they are renamed to distinct names again after the csv
# round-trip below — presumably relying on read_csv's duplicate mangling.
df_n.rename(columns={c: 'useful_platform' for c in col_n}, inplace=True)
Exporting the modified dataframe to a csv
df.to_csv('kaggle_survey_2022_responses_v2.csv')
Importing modified dataframe
# Reload the v2 csv (the written index comes back as an extra first column,
# so the positional slices below are shifted by one) and fill remaining NaNs.
df = pd.read_csv('kaggle_survey_2022_responses_v2.csv')
col_default = (df.columns)
df = df.fillna(0)
col = df.columns
# Distinct names for the four duplicate 'useful_platform' columns.
column_names = ['platform_university','platform_online','platform_videos','platform_kaggle']
Renaming columns header
# Rename positional column ranges to descriptive snake_case headers.
# The original's j-counter loops are replaced with single rename calls built
# via dict(zip(...)): same pairing, same order, same result.
# NOTE: the misspelled names ('coading_xp', 'regularL_pthon') are kept
# deliberately — later cells reference them verbatim.
df.rename(columns=dict(zip(df.columns[10:14], column_names)), inplace=True)
df.rename(columns={df.columns[14]: 'further_education'}, inplace=True)
df.rename(columns={df.columns[15]: 'paper_published'}, inplace=True)
df.rename(columns={df.columns[16]: 'coading_xp'}, inplace=True)
column_names = ['regularL_pthon', 'regularL_sql']
df.rename(columns=dict(zip(df.columns[17:19], column_names)), inplace=True)
column_names = ['IDE_used_jupyterlab', 'IDE_used_VSCode', 'IDE_used_pycharm', 'IDE_used_jupyterNotebook']
df.rename(columns=dict(zip(df.columns[19:23], column_names)), inplace=True)
column_names = ['notebook_used_kaggle', 'notebook_used_colab', 'notebook_used_Non',
                'visualization_used_matplotlib', 'visualization_used_seaborn', 'visualization_used_plotly']
df.rename(columns=dict(zip(df.columns[23:29], column_names)), inplace=True)
df.rename(columns={df.columns[29]: 'MachineLearning_xp'}, inplace=True)
Changing row values
# Map the ML-experience answer strings onto numeric years; values not in the
# mapping (e.g. the filled 0) pass through unchanged, exactly like the
# original pair of chained conditional-expression ladders.
# NOTE(review): '5-10 years' -> 10 while '10-20 years' -> 15 reproduces the
# original's (odd-looking) choices — confirm before changing.
_ml_xp_map = {'5-10 years': 10, '1-2 years': 2, '2-3 years': 3,
              '3-4 years': 4, '4-5 years': 5, '10-20 years': 15,
              '20 or more years': 20,
              'I do not use machine learning methods': 0.5}
df['MachineLearning_xp'] = df['MachineLearning_xp'].apply(lambda x: _ml_xp_map.get(x, x))
# Short names for the ML framework and algorithm flag columns (30:38).
column_names = ['ML_framework_ScikitLearn','ML_framework_TensorFlow','ML_framework_Keras','ML_framework_PyTorch',
'ML_algorithms_LinearRegression','ML_algorithms_RandomForest','ML_algorithms_GradientBoosting','ML_algorithms_ConvolutionalNeuralNetworks']
Using a for loop to change other columns header
# Apply the last two positional rename batches (frameworks/algorithms, then
# the job/employer questions) as single dict(zip) renames — same pairing and
# result as the original j-counter loops.
df.rename(columns=dict(zip(df.columns[30:38], column_names)), inplace=True)
column_names = ['Pre_trained_ModelWeight_Kaggle','Pre_trained_ModelWeight_NotUsing','job_title','Industry','company_size','DS_employee_strength','ML_incorporated',
                'employeer_WorkActivity','yearly_compensation','Personal_expenditure_forML/CloudComputing',
                'media_source_forDS_kaggle','media_source_forDS_youtube','media_source_forDS_blogs']
df.rename(columns=dict(zip(df.columns[38:51], column_names)), inplace=True)
Exporting modified dataframe to csv
df.to_csv('kaggle_survey_2022_responses_v3.csv')
Reading the modified dataframe
# Reload and drop the index column written by to_csv.
df = pd.read_csv('kaggle_survey_2022_responses_v3.csv')
df = df.drop(['Unnamed: 0'], axis =1)
Changing row values for columns where unique values are exactly two
# Tabulate each column's distinct-value count ...
j, a = [], []
for i in df.columns:
    j.append(i)
    a.append(df[i].unique().size)
tuple_uq = list(zip(j, a))
df_uq = pd.DataFrame(data=tuple_uq, columns=['column', 'unique'])
del j, a, i, tuple_uq
d_type = df.dtypes
# ... then turn every two-valued answer column (past the first 11) into a
# 0/1 flag. The csv round-trip stored the fills as the string '0'.
j = df.columns[11:]
for i in j:
    if df[i].unique().size == 2:
        df[i] = df[i].apply(lambda x: 0 if x == '0' else 1)
del i, j
# platform_university is flagged separately — presumably it carries more than
# two distinct values, so the loop above skipped it.
df['platform_university'] = df['platform_university'].apply(lambda x: 0 if x == '0' else 1)
Changing Column name
df = df[['duration', 'age', 'sex', 'country', 'EDUCATION', 'coursera', 'kaggle', 'udemy', 'university', 'other', 'platform_university', 'platform_online', 'platform_videos', 'platform_kaggle', 'further_education', 'paper_published', 'coading_xp', 'regularL_pthon', 'regularL_sql', 'IDE_used_jupyterlab', 'IDE_used_VSCode', 'IDE_used_pycharm', 'IDE_used_jupyterNotebook', 'notebook_used_kaggle', 'notebook_used_colab', 'notebook_used_Non', 'visualization_used_matplotlib', 'visualization_used_seaborn', 'visualization_used_plotly', 'MachineLearning_xp', 'ML_framework_ScikitLearn', 'ML_framework_TensorFlow', 'ML_framework_Keras', 'ML_framework_PyTorch', 'ML_algorithms_LinearRegression', 'ML_algorithms_RandomForest', 'ML_algorithms_GradientBoosting', 'ML_algorithms_ConvolutionalNeuralNetworks', 'Pre_trained_ModelWeight_Kaggle', 'Pre_trained_ModelWeight_NotUsing', 'job_title', 'Industry', 'company_size', 'DS_employee_strength', 'ML_incorporated', 'employeer_WorkActivity', 'yearly_compensation', 'Personal_expenditure_forML/CloudComputing', 'media_source_forDS_kaggle', 'media_source_forDS_youtube', 'media_source_forDS_blogs']]
Exporting Dataframe to csv file
df.to_csv('kaggle_survey_2022_responses_v5.csv')
Importing and modifying the dataframe
# Reload the cleaned survey, drop the csv index column, and encode student
# status as 1 (Yes) / 0 (No); missing answers are treated as Yes.
df = pd.read_csv('kaggle_survey_2022_responses_v5.csv').drop('Unnamed: 0', axis=1)
df['EDUCATION'] = df['EDUCATION'].fillna('Yes').replace({'Yes': 1, 'No': 0})
df = df.reset_index()
df.nunique()
index 23997 duration 3529 age 11 sex 5 country 58 EDUCATION 2 coursera 2 kaggle 2 udemy 2 university 2 other 2 platform_university 2 platform_online 2 platform_videos 2 platform_kaggle 2 further_education 8 paper_published 3 coading_xp 8 regularL_pthon 2 regularL_sql 2 IDE_used_jupyterlab 2 IDE_used_VSCode 2 IDE_used_pycharm 2 IDE_used_jupyterNotebook 2 notebook_used_kaggle 2 notebook_used_colab 2 notebook_used_Non 2 visualization_used_matplotlib 2 visualization_used_seaborn 2 visualization_used_plotly 2 MachineLearning_xp 10 ML_framework_ScikitLearn 2 ML_framework_TensorFlow 2 ML_framework_Keras 2 ML_framework_PyTorch 2 ML_algorithms_LinearRegression 2 ML_algorithms_RandomForest 2 ML_algorithms_GradientBoosting 2 ML_algorithms_ConvolutionalNeuralNetworks 2 Pre_trained_ModelWeight_Kaggle 2 Pre_trained_ModelWeight_NotUsing 2 job_title 16 Industry 16 company_size 6 DS_employee_strength 7 ML_incorporated 7 employeer_WorkActivity 2 yearly_compensation 27 Personal_expenditure_forML/CloudComputing 7 media_source_forDS_kaggle 2 media_source_forDS_youtube 2 media_source_forDS_blogs 2 dtype: int64
Visualizing Gender ratio Data from dataframe Kaggle
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

# Two pies: the big one shows Male / Female / everything-else collapsed into
# an 'others' bucket; the small one breaks the minority categories back out.
cat = df['sex'].value_counts()
ans = cat.index
cat_df = pd.DataFrame(cat).reset_index(level=0)
# Everything past the two largest categories forms the 'others' bucket.
new_row = {'sex': 'others', 'count': sum(cat_df.iloc[2:, 1])}
cat_df = pd.concat([cat_df, pd.DataFrame([new_row])], ignore_index=True)
cat_df = cat_df.drop(cat_df.index[2:5])
cat_2 = cat_df.iloc[:, 1]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
colors = ['rosybrown', 'moccasin', 'lightyellow']
ax1.pie(cat_2, labels=['Male', 'Female', 'Others'], colors=colors,
        rotatelabels=180, radius=1, autopct='%1.1f%%')
ax2.pie(cat[2:], labels=ans[2:], colors=['lavender', 'darkseagreen', 'gold'],
        rotatelabels=180, radius=0.5, autopct='%1.1f%%')
plt.title("Gender Ratio amoung \n Data Scientists", bbox={'facecolor': '0.8', 'pad': 5},
          fontweight='bold', x=0, y=1, fontsize=20)
fig.tight_layout()
plt.show()
Visualizing age group Distribution in the Dataframe
# Pie chart of the age-group distribution.
# (The original also built df_age = df.sort_values(by=['age']) here, but it
# was never read anywhere — removed as dead code.)
age = df['age'].value_counts()
col_age = age.index
colors = ['rosybrown', 'moccasin', 'lightyellow', 'lavender', 'darkseagreen', 'gold']
plt.pie(age, labels=col_age, colors=colors, rotatelabels=180, radius=1.6, autopct='%1.1f%%')
plt.title("Age Groups amoung \n Data Scientists", bbox={'facecolor': '0.8', 'pad': 5}, fontweight='bold', x=0.85, y=1.25, fontsize=20)
plt.grid(False)
#pip install squarify
import squarify
import matplotlib
# (the original imported squarify twice in this cell; the duplicate import
# has been removed)

# Treemap of the age distribution: tile area is proportional to count.
append_str = 'Age:'
pre_res = [append_str + sub for sub in col_age]
plt.figure(figsize=(14, 8))
squarify.plot(sizes=age, label=pre_res, value=age,
              color=[matplotlib.cm.Set2(i) for i in range(7)],
              text_kwargs={'fontsize': 13})
plt.title('Squarify way to better understand Age Distribution', fontsize=27, fontweight='bold')
plt.axis('off')
plt.show()
# Treemap of survey participation per country.
# (The original also built df_country = df.sort_values(by=['country']) here,
# but it was never read before being reassigned later — removed.)
country = df['country'].value_counts()
col_country = country.index
plt.figure(figsize=(14, 8))
squarify.plot(sizes=country, label=col_country, value=country,
              color=[matplotlib.cm.Set2(i) for i in range(7)],
              text_kwargs={'fontsize': 5.5})
plt.title('Country Distribution of survey participation', fontsize=27, fontweight='bold')
plt.axis('off')
plt.show()
# Fresh reload of v5 (without the reset_index of the earlier reload) and the
# same Yes/No -> 1/0 encoding of student status.
df = pd.read_csv('kaggle_survey_2022_responses_v5.csv').drop('Unnamed: 0', axis=1)
df['EDUCATION'] = df['EDUCATION'].fillna('Yes').replace({'Yes': 1, 'No': 0})
Creating a dataframe that shows each column name and its unique-value count
# Re-tabulate each column's distinct-value count on the reloaded frame.
j, a = [], []
for i in df.columns:
    j.append(i)
    a.append(df[i].unique().size)
tuple_uq = list(zip(j, a))
df_uq = pd.DataFrame(data=tuple_uq, columns=['column', 'unique'])
del j, a, i, tuple_uq
Creating a dataframe of the columns that have more than two unique values
# Collect the answer columns (past the first four id-like ones) that have
# more than two distinct values, and lay their unique values out one row per
# column, then transpose so each question becomes a column.
a = []
col_2 = []
for i in df.columns[4:]:
    u = df[i].unique()
    if u.size > 2:
        a.append(u)
        col_2.append(i)
# Width = the longest unique-value list. The original hard-coded 27, which
# breaks if the survey data changes; deriving it is equivalent for this data.
width = max(len(u) for u in a)
df_n = pd.DataFrame(data=a, columns=list(range(width)))
df_n = df_n.drop(0, axis=1)  # drop each question's first unique value
df_n = df_n.transpose()
df_n.columns = col_2
del i, a, col_2
import seaborn as sns

# Total enrolments per learning platform (columns 5:10 are 0/1 flags).
df_course = df.iloc[:, 5:10]
col = df_course.columns.tolist()
df_course['coursera'].sum()
tup = [(name, df_course[name].sum()) for name in df_course.columns]
df_cor = pd.DataFrame(data=tup, columns=['course', 'count'])
sns.barplot(x="count", y="course",
            data=df_cor,).set(title='Course chosen to study Data Science')
plt.grid(False)
# Per-country platform totals, then a stacked bar for the two countries with
# the highest overall course uptake.
df_course['country'] = df['country']
df_course['country'].unique().size
# (the original built an intermediate GroupBy object here and immediately
# overwrote it — dead statement removed)
df_course_g = df_course.groupby(by=["country"]).sum()
df_course_g.columns
# Vectorized row total; the original used a per-row apply (a Python call per
# country) for the same sum.
df_course_g['sum'] = (df_course_g.coursera + df_course_g.kaggle + df_course_g.udemy
                      + df_course_g.university + df_course_g.other)
df_course_g = df_course_g.sort_values(by=['sum'], ascending=False)
# Top two countries by total uptake (despite the "_10" in the name).
df_cc_10 = df_course_g[:2]
df_cc_10 = df_cc_10.reset_index(level=0)
df_cc_10['country'].values
coursera = df_course_g['coursera']
kaggle = df_course_g['kaggle']
udemy = df_course_g['udemy']
other = df_course_g['other']
university = df_course_g['university']
# Passing only the two top countries as the index keeps just those rows:
# pandas aligns each full-length Series on the supplied index.
index = df_cc_10['country'].values
graph = pd.DataFrame({'coursera': coursera, 'kaggle': kaggle, 'udemy': udemy,
                      'other': other, 'university': university},
                     index=index)
# Title typo fixed ("Pupular" -> "Popular").
ax = graph.plot.barh(stacked=True, title="Popular Course in Top two Countries")
graph.head(10)
| coursera | kaggle | udemy | other | university | |
|---|---|---|---|---|---|
| India | 3073 | 2197 | 2245 | 2732 | 2105 |
| United States of America | 1543 | 766 | 776 | 459 | 1068 |
# Same stacked-bar view for the next tier of countries.
# NOTE(review): the slice [3:10] skips the third-ranked country (row index 2)
# entirely — [2:10] may have been intended; behavior preserved as-is.
df_cc_10_8 = df_course_g[3:10].reset_index(level=0)
df_cc_10_8['country'].values
coursera = df_course_g['coursera']
kaggle = df_course_g['kaggle']
udemy = df_course_g['udemy']
other = df_course_g['other']
university = df_course_g['university']
index1 = df_cc_10_8['country'].values
graph_c = pd.DataFrame({'coursera': coursera, 'kaggle': kaggle, 'udemy': udemy,
                        'other': other, 'university': university},
                       index=index1)
graph_c.plot.barh(stacked=True, title="Courses Popularity in other countries")
plt.grid(False)
# IDE usage flags (columns 20:23) aggregated per age group.
df_ideused = df.iloc[:, 20:23]
df_ideused['age'] = df.iloc[:, 1]
df_ide_grp = df_ideused.groupby(by='age').sum().reset_index(level=0)
df_ide_grp.columns
#
# Overall popularity of each IDE.
df_ide = df.iloc[:, 20:23]
col = df_ide.columns.tolist()
tup = [(name, df_ide[name].sum()) for name in df_ide.columns]
df_ide_used = pd.DataFrame(data=tup, columns=['ide', 'count'])
sns.barplot(x="count", y="ide", data=df_ide_used).set(title="Popular IDE")
df_ide_grp.columns
#
VSCode = df_ide_grp['IDE_used_VSCode'].values
pycharm = df_ide_grp['IDE_used_pycharm'].values
jupyterNotebook = df_ide_grp['IDE_used_jupyterNotebook'].values
#
# Stacked bars of IDE usage per age group.
index_v = df_ide_grp['age'].values
graph_v = pd.DataFrame({'pycharm': pycharm, 'Jupyter_Notebook': jupyterNotebook,
                        'VSCode': VSCode},
                       index=index_v)
ax_nb = graph_v.plot.barh(stacked=True, title="Notebooks amoung Age-Groups")
Exporting the data to a new csv file
df.to_csv('kaggle_survey_2022_responses_5_v6.csv')
Importing the dataframe and the country-code file
# Reload the final csv and build per-country respondent counts, plus the
# country -> map-code lookup from code.xlsx.
df = pd.read_csv('kaggle_survey_2022_responses_5_v6.csv')
df = df.drop('Unnamed: 0', axis=1)
df_country = df['country']
df_code = pd.read_excel('code.xlsx')
df_country = df_country.reset_index()
# Each respondent counts once. (The original used
# .apply(lambda x: 1 if x != 0 else 1) — both branches return 1, so a plain
# constant assignment is equivalent and says so directly.)
df_country['index'] = 1
df_country_grp = df_country.groupby(by=["country"]).sum()
df_country_grp = df_country_grp.reset_index()
df_country_grp.rename(columns={'index': 'count'}, inplace=True)
dict_code = dict(zip(df_code.Country, df_code.code))
Mapping the code dictionary onto the Kaggle dataframe's country column
df_country_grp["code"] = df_country_grp["country"].map(dict_code)
Manually adding missing countries to the dataframe
# Patch codes for survey country names that don't match the code table.
df_na = df_country_grp.loc[df_country_grp["code"].isna()]
# BUG FIX: the original dict literal listed 'South Korea' twice ('tw', then
# 'kp'); with duplicate keys only 'kp' survived — and 'kp' is NORTH Korea's
# code. South Korea's code is 'kr'; the duplicate entry is removed.
update_na = {'Hong Kong (S.A.R.)':'hk', 'Iran, Islamic Republic of...':'ir', 'Russia':'ru', 'United States of America':'us', 'South Korea':'kr', 'United Kingdom of Great Britain and Northern Ireland':'gb','Taiwan':'tw'}
dict_code.update(update_na)
df_country_grp["code"] = df_country_grp["country"].map(dict_code)
df_na = df_country_grp.loc[df_country_grp["code"].isna()]
# Drop the countries that still have no code.
df_country_grp_new = df_country_grp.dropna(axis = 0)
df_na = df_country_grp_new.loc[df_country_grp_new["code"].isna()]
Creating a new dictionary to show country: count of Data scientist
# {'count': {code: respondent_count, ...}} — consumed by the pygal map below.
df_map = df_country_grp_new[['code','count']].set_index('code')
m = df_map.to_dict()
Creating a visually appealing world map which highlights the popularity of DATA science in the world
# pip install pygal_maps_world
# pip install svglib
# BUG FIX: the two `pip install ...` lines above were bare statements in the
# original cell — valid only as IPython magics, a SyntaxError as plain
# Python — so they are kept as comments.
from pygal_maps_world.maps import World
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF, renderPM
worldmap_chart = World()
import pygal
#worldmap_chart = pygal.maps.world.World()
worldmap_chart.title = 'Data Science Popularity \n according to Kaggle Survey'
worldmap_chart.add('Data_Science',m['count'])
worldmap_chart.render_to_file('World_Data.svg', auto_open=True)
#worldmap_chart.render_to_png('World_Data.png', auto_open=True)
print('Eureka!!')
Eureka!!
# Count respondents per yearly-compensation bracket and sort ascending for
# the circular bar chart below.
df_yr = df['yearly_compensation'].reset_index()
# (original: .apply(lambda x: 1 if x != 0 else 1) — both branches are 1)
df_yr['count'] = 1
df_yr.drop('index', axis=1, inplace=True)
df_yr_grp = df_yr.groupby(by='yearly_compensation').sum().reset_index()
# Strip '$' and '>' so the bracket labels are plain numeric ranges.
df_yr_grp['yearly_compensation'] = df_yr_grp['yearly_compensation'].apply(
    lambda x: x.replace('$', '').replace('>', ''))
# Drop the '0' bucket (respondents with no compensation answer).
df_yr_grp_n = df_yr_grp[df_yr_grp.yearly_compensation != '0']
df_yr_grp_n = df_yr_grp_n.sort_values(by=['count'], ascending=False)
df_select = df_yr_grp_n  # alias: the in-place sorts below affect both names
df_select.sort_values('count', ascending=True, inplace=True)
df_select.reset_index(drop=True, inplace=True)
Yearly Compensation of DATA Scientists
import math
import matplotlib.pyplot as plt
import seaborn as sns
# Circular (polar) bar chart: one bar per compensation bracket; the angle of
# each bar is scaled so the largest count spans the full circle.
plt.gcf().set_size_inches(8, 8)
sns.set_style('darkgrid')
max_val = max(df_select['count'])
ax = plt.subplot(projection='polar')
# One Spectral color per distinct bracket label.
list_dist = list(set(df_select['yearly_compensation']))
pal = list(sns.color_palette(palette='Spectral',
n_colors=len(list_dist)).as_hex())
dict_color = dict(zip(list_dist, pal))
for i in range(len(df_select)):
    dt_code = df_select['yearly_compensation'][i]
    ax.barh(i, df_select['count'][i]*2*np.pi/max_val,
    label=dt_code, color=dict_color.get(dt_code))
ax.set_theta_zero_location('N')
ax.set_theta_direction(1)
ax.set_rlabel_position(0)
ax.set_thetagrids([], labels=[])
ax.set_rgrids(range(len(df_select)), labels= df_select['count'])
# NOTE(review): this second plt.subplot(projection='polar') call returns the
# same axes as the one above and looks redundant — confirm before removing.
ax = plt.subplot(projection='polar')
plt.tight_layout()
handles, labels = plt.gca().get_legend_handles_labels()
# Show the legend entries in reverse drawing order.
order = list(range(len(labels)))
order.reverse()
plt.legend([handles[idx] for idx in order],[labels[idx] for idx in order],
bbox_to_anchor=(1.1, 1.01), title='Yearly Compensation')
plt.title('Yearly Compensation for Data Science in US$')
plt.show()
# Coding-experience distribution within each industry, as stacked bars.
df_post = df[['coading_xp','Industry']]
# (original: .apply(lambda x: 1 if x!=0 else 1) — both branches are 1)
df_post['count'] = 1
df_post = df_post[df_post.Industry != '0']
df_post_grp = df_post.pivot_table(index=['Industry'], columns=['coading_xp'], values=['count'], aggfunc='sum')
df_post_grp_n = df_post_grp.reset_index()
# Flatten the pivot's MultiIndex headers to simple labels.
col_post = ['industry','1-3 years', '10-20 years','20+ years','3-5 years','5-10 years','<1 years','no expr']
df_post_grp_n.columns = col_post
df_post_col = df_post_grp_n.columns
# Re-order the experience buckets chronologically for the plot.
df_post_plot = df_post_grp_n[['industry', 'no expr', '<1 years', '1-3 years','3-5 years','5-10 years', '10-20 years','20+ years']]
sns.set(style='white')
df_post_plot.set_index('industry').plot(kind='bar', stacked=True)
# Title typos fixed ("Experiance" -> "Experience", "vaious" -> "various").
plt.title('Data Scientist Experience distribution in various domains')
plt.grid(False)
#df_strength = df_strength [~df_strength ['Industry'].isin(['Computers/Technology','Academics/Education'])]
df_strength = df[['Industry','company_size','DS_employee_strength']]
df_strength = df_strength[df_strength.Industry != '0']
df_strength_edit = df_strength
df_strength_edit.DS_employee_strength.value_counts()
import plotly.express as px
replace_values = {'20+' : 20, '1-2' : 2, '3-4' : 3, '5-9' : 7, '10-14' : 10, '15-19': 16, '0' : 0}
df_strength_edit = df_strength_edit.replace({"DS_employee_strength": replace_values})
fig = px.bar_polar(df_strength_edit, r="company_size", theta="Industry", color="DS_employee_strength", template= 'gridon',
color_discrete_sequence= px.colors.sequential.Plasma_r)
#fig.write_html('DS_Strength.html', auto_open=True)
fig
#pip install -U kaleido
fig.write_image("fig_sector.png")
# Animated bar chart: one frame per industry, company size on x, DS team
# strength on y.
fig = px.bar(df_strength_edit, x='company_size', y='DS_employee_strength',
color='company_size', text='DS_employee_strength',
# NOTE(review): dict_color was built for compensation-bracket labels in the
# polar chart above; its keys are not company_size values, so this mapping
# presumably has no effect here — verify the intended palette.
color_discrete_map= dict_color,
animation_frame='Industry',
animation_group='Industry',
range_y=[0,20],
labels={ 'DS_employee_strength': 'Employee Count'},
)
fig.update_layout(width=1000, height=600, showlegend=False,
xaxis = dict(tickmode = 'linear', dtick = 1))
fig.update_traces(textfont_size=16, textangle=0)
#fig.write_html('DS_emp_count.svg', auto_open=True)
# Reload the raw survey to analyse the multi-select "programming languages
# used regularly" question (Q12 — columns 30:45, one column per option).
dataf = pd.read_csv('kaggle_survey_2022_responses.csv')
# displaying chart
dataf.columns.get_loc("Q12_1")
df_reg = dataf.iloc[:, 30:45]
df_reg_d = df_reg.dropna(how='all')
# Shorten each header to its last word (the language name).
df_reg_d.columns = [header.split()[-1] for header in df_reg_d.columns.tolist()]
df_reg_d = df_reg_d.drop(0, axis=0)  # drop the question-text row
df_reg_f = df_reg_d.fillna(0)
# Any remaining (non-numeric) answer text counts as a selection -> 1.
df_reg_f = df_reg_f.replace(regex={r'\D+': 1})
df_reg_f.dtypes
# Rank the languages by total selections (comprehension replaces the
# original a/b accumulator loop).
tup = [(lang, df_reg_f[lang].sum()) for lang in df_reg_f.columns]
df_reg_n = pd.DataFrame(data=tup, columns=['Regular_Lang','Count'])
df_reg_n = df_reg_n.sort_values(by=['Count'], ascending=False)
df_reg_n = df_reg_n.reset_index().drop('index', axis=1)
# NOTE(review): this hard-coded list overwrites the sorted labels and assumes
# the popularity ranking always matches this exact order — check it against
# df_reg_n['Regular_Lang'] before trusting the chart labels.
col_con = ['Python', 'R', 'SQL', 'C', 'C#', 'C++', 'Java', 'Javascript',
'Bash', 'PHP', 'MATLAB', 'Julia', 'Go', 'None', 'Other']
df_reg_n['Regular_Languages'] = col_con
df_reg_n = df_reg_n[['Regular_Languages','Count']]
#
import matplotlib.pyplot as plt
import seaborn
palette_color = seaborn.color_palette('deep')
# Pie of the top eight languages, then a bar chart of all of them.
plt.pie(df_reg_n.iloc[:8,1], labels= df_reg_n.iloc[:8,0], colors=palette_color, autopct='%.0f%%')
# Title typo fixed ("Scietists" -> "Scientists").
plt.title('Plotting Regular Languages used by Data Scientists')
plt.grid(False)
sns.barplot(data=df_reg_n, y="Regular_Languages", x="Count")
plt.grid(False)
# Per-country language counts: the same Q12 columns plus the country column.
df_reg_cnty = dataf.iloc[:, 30:45]
df_reg_cnty['country'] = dataf.iloc[:, 3]
# Row 0 holds the full question text; promote it to headers, then shorten
# each header to its final word (the country header becomes 'reside?').
df_reg_cnty.columns = df_reg_cnty.iloc[0]
df_reg_cnty.columns = [q.split()[-1] for q in df_reg_cnty.columns.tolist()]
df_reg_cnty = df_reg_cnty.drop(0, axis=0)
df_reg_cnty_f = df_reg_cnty.dropna(how='all').fillna(0)
# Language cells: any answer text -> 1; re-attach country names afterwards.
df_reg_cnty_f = df_reg_cnty_f.iloc[:, :-1].replace(regex={r'\D+': 1})
df_reg_cnty_f['country'] = df_reg_cnty['reside?']
df_reg_grp = df_reg_cnty_f.groupby(by='country').sum().reset_index()
df_reg_grp.iloc[0, 1:].sum()
df_reg_grp['sum'] = df_reg_grp.iloc[:, 1:].sum(axis=1)
df_reg_grp = df_reg_grp.sort_values(by='sum', ascending=False).reset_index()
df_reg_grp = df_reg_grp.drop('index', axis=1)
# Top ten countries, with the USA label shortened for the x-axis.
df_reg_grp_new = df_reg_grp.iloc[:10, :].replace('United States of America', 'USA')
df_reg_grp_new.plot(x="country", y=['Python', 'R', 'SQL', 'C', 'C#'], kind="bar")
plt.title('Languages in different Countries')
plt.grid(False)
Summarizing the technical survey results
# Pull the platform (10:13), visualization (26:29) and ML framework/
# algorithm (30:38) flag columns together and total each one.
df_sum = df.iloc[:, 10:13].join(df.iloc[:, 26:29]).join(df.iloc[:, 30:38])
df_sum.dtypes
tup = [(name, df_sum.loc[:, name].sum()) for name in df_sum.columns]
data_sum = pd.DataFrame(tup, columns=['disc', 'sum'])
data_sum.iloc[:, 0].tolist()
# Short display labels, in the same order as df_sum's columns.
col_d = ['university',
         'online',
         'videos',
         'matplotlib',
         'seaborn',
         'plotly',
         'ScikitLearn',
         'TensorFlow',
         'Keras',
         'PyTorch',
         'LinearRegression',
         'RandomForest',
         'GradientBoosting',
         'ConvolutionalNeuralNetworks']
data_sum['disc'] = col_d
Platform Chosen to study Data Science
import seaborn as sns
import matplotlib.pyplot as plt

# The first three rows of data_sum are the study-platform totals.
x1 = data_sum.iloc[0:3, 0]
y1 = data_sum.iloc[0:3, 1]
colr = sns.color_palette("Paired")
plt.title('platform chosen to \n study DataScience', fontsize=30)
# plt.pie with autopct returns (wedges, texts, autotexts).
p, q, r = plt.pie(y1, autopct='%1.1f%%', startangle=90, colors=colr,
                  shadow=True, radius=2, textprops={'fontsize': 20})
plt.legend(p, x1, loc="center", fontsize=15)
plt.show()
Assigning the other variables
# Remaining summary slices: visualization libs, ML frameworks, ML algorithms.
x2, y2 = data_sum.iloc[3:6, 0], data_sum.iloc[3:6, 1]
x3, y3 = data_sum.iloc[6:10, 0], data_sum.iloc[6:10, 1]
x4, y4 = data_sum.iloc[10:, 0], data_sum.iloc[10:, 1]
color = sns.color_palette('muted')
plt.figure(figsize=(20, 20))
plt.subplot(2, 2, 1)
plt.title('Visualization Platform', fontsize=30)
p, q, r = plt.pie(y2, autopct='%1.1f%%', colors=color, textprops={'fontsize': 20}, rotatelabels=True, startangle=160)
plt.legend(p, x2, loc="center", fontsize=15)
plt.subplot(2, 2, 2)
plt.title('Machine Learning Framework', fontsize=30)
p, q, r = plt.pie(y3, autopct='%1.1f%%', colors=color, textprops={'fontsize': 20}, rotatelabels=True, startangle=160)
plt.legend(p, x3, loc="center", fontsize=15)
plt.subplot(2, 2, 3)
plt.title('Machine Learning Algorithm used', fontsize=30)
p, q, r = plt.pie(y4, autopct='%1.1f%%', colors=color, textprops={'fontsize': 20}, rotatelabels=True, startangle=160)
plt.legend(p, x4, loc="best", fontsize=15)
plt.show()